In [36]:
% matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import itertools, powerlaw
from collections import Counter
import seaborn as sns
sns.set(style="white",rc={"figure.figsize": (6, 6)})

# fdir = 'E:/Dropbox/Hacking/Baseball/'
fdir = '/Users/brianckeegan/Dropbox/Hacking/Baseball/'
figpath = '/Users/brianckeegan/Dropbox/Papers/Network Science/Baseball/'

Batting


In [2]:
batting_df = pd.read_csv(fdir + 'Lahman/Batting.csv')
batting_df.tail()


Out[2]:
playerID yearID stint teamID lgID G G_batting AB R H ... SB CS BB SO IBB HBP SH SF GIDP G_old
97884 zimmejo02 2013 1 WAS NL 32 32 65 4 8 ... 0 0 1 20 0 0 6 1 0 NaN
97885 zimmery01 2013 1 WAS NL 147 147 568 84 156 ... 6 0 60 133 2 2 0 3 16 NaN
97886 zitoba01 2013 1 SFN NL 30 30 34 3 5 ... 0 0 0 8 0 0 9 0 1 NaN
97887 zobribe01 2013 1 TBA AL 157 157 612 77 168 ... 11 3 72 91 4 7 1 6 18 NaN
97888 zuninmi01 2013 1 SEA AL 52 52 173 22 37 ... 1 0 16 49 0 3 0 1 5 NaN

5 rows × 24 columns

Teams


In [3]:
teams_df = pd.read_csv(fdir + 'Lahman/Teams.csv')
teams_df.tail()


Out[3]:
yearID lgID teamID franchID divID Rank G Ghome W L ... DP FP name park attendance BPF PPF teamIDBR teamIDlahman45 teamIDretro
2740 2013 NL LAN LAD W 1 162 81 92 70 ... 160 0.982 Los Angeles Dodgers Dodger Stadium 3743527 95 95 LAD LAN LAN
2741 2013 NL ARI ARI W 2 162 81 81 81 ... 134 0.988 Arizona Diamondbacks Chase Field 2134795 102 102 ARI ARI ARI
2742 2013 NL SDN SDP W 3 162 81 76 86 ... 140 0.986 San Diego Padres Petco Park 2166691 91 91 SDP SDN SDN
2743 2013 NL SFN SFG W 4 162 82 76 86 ... 126 0.982 San Francisco Giants AT&T Park 3326796 90 89 SFG SFN SFN
2744 2013 NL COL COL W 5 162 81 74 88 ... 162 0.986 Colorado Rockies Coors Field 2793828 117 118 COL COL COL

5 rows × 48 columns

Game logs


In [4]:
gl_2013 = pd.read_csv(fdir + 'GameLogs/GL2013.TXT',header=None)
gl_2013.head()


Out[4]:
0 1 2 3 4 5 6 7 8 9 ... 151 152 153 154 155 156 157 158 159 160
0 20130331 0 Sun TEX AL 1 HOU AL 1 2 ... Matt Dominguez 5 barnb002 Brandon Barnes 9 ceder002 Ronny Cedeno 6 NaN Y
1 20130401 0 Mon KCA AL 1 CHA AL 1 0 ... Alexei Ramirez 6 flowt001 Tyler Flowers 2 beckg001 Gordon Beckham 4 NaN Y
2 20130401 0 Mon DET AL 1 MIN AL 1 4 ... Chris Parmelee 9 dozib001 Brian Dozier 4 florp001 Pedro Florimon 6 NaN Y
3 20130401 0 Mon BOS AL 1 NYA AL 1 8 ... Ichiro Suzuki 9 nix-j001 Jayson Nix 5 cervf001 Francisco Cervelli 2 NaN Y
4 20130401 0 Mon SEA AL 1 OAK AL 1 2 ... Joshua Donaldson 5 smits002 Seth Smith 10 sogae001 Eric Sogard 4 NaN Y

5 rows × 161 columns


In [5]:
home_col = [3] + np.arange(105,132,3).tolist()
away_col = [6] + np.arange(132,159,3).tolist()

In [6]:
# Get the lineups of all the home team lineups
home_df = gl_2013[home_col]

# Rename the columns so the concatenate and melt nicer
home_df.columns = ['Team'] + np.arange(1,10).tolist()

# Get all the lineups of the away team lineups
away_df = gl_2013[away_col]

# Rename the columns so the concatenate and melt nicer
away_df.columns = ['Team'] + np.arange(1,10).tolist()

# Combine the home and away lineups
all_games_df = pd.concat([home_df,away_df])

# Big nasty operation that melts wide format into long format, 
# removes duplicate entries, groups by playerID, 
# aggregates into a list of teams played, then converts to a dictionary.
# This dictionary is keyed by playerID and returns a list of all the 
# teams of which the player was a member
affiliations_by_player = pd.melt(all_games_df,id_vars='Team',value_vars=np.arange(1,10).tolist()).drop('variable',1).drop_duplicates().groupby('value').agg({'Team':lambda x:list(x)}).to_dict()['Team']
affiliations_by_team = pd.melt(all_games_df,id_vars='Team',value_vars=np.arange(1,10).tolist()).drop('variable',1).drop_duplicates().groupby('Team').agg({'value':lambda x:list(x)}).to_dict()['value']

2013 analysis


In [7]:
# Get all the home and away lineups, create a list of combinations of these names 
# reflecting teammates who've played together, then count how often these edges occur
home = Counter([j for i in gl_2013[np.arange(105,132,3)].values.tolist() for j in list(itertools.combinations(i,2))])
away = Counter([j for i in gl_2013[np.arange(132,159,3)].values.tolist() for j in list(itertools.combinations(i,2))])

# Add the home and away Counter objects together to get all games played together
games = home + away

# Create a network object
g2013 = nx.Graph()

# Iterate over the edges in the games Counter object adding edges, 
# also add node properties from affiliations
for (p1,p2),c in games.iteritems():
    g2013.add_edge(p1,p2,weight=c)
    g2013.add_node(p1,teams=affiliations_by_player[p1])
    g2013.add_node(p2,teams=affiliations_by_player[p2])

In [8]:
# http://orange.biolab.si/blog/2012/06/15/joint-entropy-in-python/
def entropy2(*X):
    return np.sum(-p * np.log2(p) if p > 0 else 0 for p in
        (np.mean(reduce(np.logical_and, (predictions == c for predictions, c in zip(X, classes))))
            for classes in itertools.product(*[set(x) for x in X])))

In [9]:
def ei_index(g,affiliations,team):
    team_graph = g.subgraph(affiliations[team])
    non_team_edges = [(node,neighbor,d) for node in team_graph for neighbor,d in g[node].items() if neighbor not in team_graph.nodes()]
    return (len(non_team_edges) - len(team_graph.edges()))/float(len(non_team_edges) + len(team_graph.edges()))

def weight_entropy(g,affiliations,team):
    weights = [float(d['weight']) for n1,n2,d in g.subgraph(affiliations[team]).edges(data=True)]
    return entropy2(np.array(weights)/float(sum(weights)))

def strength_entropy(g,affiliations,team):
    subgraph = g.subgraph(affiliations[team])
    strengths = [sum(subgraph[node][neighbor]['weight'] for neighbor in subgraph[node]) for node in subgraph.nodes()]
    return entropy2(np.array(strengths)/float(sum(strengths)))

In [10]:
team_network_statistics = pd.DataFrame(index=affiliations_by_team.keys())

team_network_statistics['Nodes'] = pd.Series({team:g2013.subgraph(affiliations_by_team[team]).number_of_nodes() for team in affiliations_by_team.keys()})
team_network_statistics['Edges'] = pd.Series({team:g2013.subgraph(affiliations_by_team[team]).number_of_edges() for team in affiliations_by_team.keys()})
team_network_statistics['Density'] = pd.Series({team:nx.density(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['Avg_Clustering'] = pd.Series({team:nx.average_clustering(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['Avg_Weighted_Clustering'] = pd.Series({team:nx.average_clustering(g2013.subgraph(affiliations_by_team[team]),weight='weight') for team in affiliations_by_team.keys()})
team_network_statistics['Avg_Connectivity'] = pd.Series({team:np.mean(nx.degree_centrality(g2013.subgraph(affiliations_by_team[team])).values()) for team in affiliations_by_team.keys()})
team_network_statistics['Diameter'] = pd.Series({team:nx.diameter(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['Radius'] = pd.Series({team:nx.radius(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['Avg_Shortest_path'] = pd.Series({team:nx.average_shortest_path_length(g2013.subgraph(affiliations_by_team[team])) for team in affiliations_by_team.keys()})
team_network_statistics['EI_index'] = pd.Series({team:ei_index(g2013,affiliations_by_team,team) for team in affiliations_by_team.keys()})
team_network_statistics['Weight_entropy'] = pd.Series({team:weight_entropy(g2013,affiliations_by_team,team) for team in affiliations_by_team.keys()})
team_network_statistics['Strength_entropy'] = pd.Series({team:strength_entropy(g2013,affiliations_by_team,team) for team in affiliations_by_team.keys()})

In [11]:
team_joined_df = team_network_statistics.join(teams_df[teams_df['yearID'] == 2013].set_index('teamID'))
team_joined_df['BA'] = team_joined_df['H'] / team_joined_df['AB']
team_joined_df['SLG'] = (team_joined_df['H'] + 2*team_joined_df['2B'] + 3*team_joined_df['3B'] + 4*team_joined_df['HR']) / team_joined_df['AB']
team_joined_df


Out[11]:
Nodes Edges Density Avg_Clustering Avg_Weighted_Clustering Avg_Connectivity Diameter Radius Avg_Shortest_path EI_index ... name park attendance BPF PPF teamIDBR teamIDlahman45 teamIDretro BA SLG
MIL 31 319 0.686022 0.847033 0.065738 0.686022 2 1 1.313978 -0.893175 ... Milwaukee Brewers Miller Park 2531105 105 105 MIL ML4 MIL 0.252284 0.477530
MIN 27 247 0.703704 0.863261 0.073799 0.703704 2 1 1.296296 -0.588424 ... Minnesota Twins Target Field 2477644 99 101 MIN MIN MIN 0.241912 0.460999
MIA 37 402 0.603604 0.805358 0.079482 0.603604 2 1 1.396396 -0.923445 ... Miami Marlins Marlins Park 1586322 102 103 MIA FLO MIA 0.230685 0.397871
ATL 30 301 0.691954 0.849447 0.064868 0.691954 2 1 1.308046 -0.750000 ... Atlanta Braves Turner Field 2548679 104 103 ATL ATL ATL 0.248851 0.484286
BOS 27 239 0.680912 0.855273 0.074903 0.680912 2 1 1.319088 -0.397661 ... Boston Red Sox Fenway Park II 2833333 102 102 BOS BOS BOS 0.277119 0.546983
DET 24 197 0.713768 0.860695 0.082192 0.713768 2 1 1.286232 -0.690987 ... Detroit Tigers Comerica Park 3083397 106 105 DET DET DET 0.283348 0.519965
CIN 25 231 0.770000 0.858601 0.078578 0.770000 2 1 1.230000 -1.000000 ... Cincinnati Reds Great American Ball Park 2534369 103 102 CIN CIN CIN 0.249136 0.472450
NYN 37 440 0.660661 0.814347 0.063154 0.660661 2 1 1.339339 -0.577061 ... New York Mets Citi Field 2135657 94 95 NYM NYN NYN 0.237093 0.442526
BAL 33 274 0.518939 0.835306 0.045051 0.518939 2 1 1.481061 -0.513812 ... Baltimore Orioles Oriole Park at Camden Yards 2357561 106 105 BAL BAL BAL 0.259786 0.524199
NYA 39 394 0.531714 0.812334 0.049769 0.531714 2 1 1.468286 -0.340136 ... New York Yankees Yankee Stadium III 3279589 102 101 NYY NYA NYA 0.242430 0.452010
COL 31 345 0.741935 0.854358 0.093368 0.741935 2 1 1.258065 -0.582569 ... Colorado Rockies Coors Field 2793828 117 118 COL COL COL 0.269870 0.503840
OAK 29 262 0.645320 0.841020 0.062386 0.645320 2 1 1.354680 -0.435616 ... Oakland Athletics O.co Coliseum 1809302 95 93 OAK OAK OAK 0.254121 0.511502
TEX 27 233 0.663818 0.853693 0.059061 0.663818 2 1 1.336182 -0.618056 ... Texas Rangers Rangers Ballpark in Arlington 3178273 104 103 TEX TEX TEX 0.262310 0.494539
TOR 28 244 0.645503 0.839040 0.074596 0.645503 2 2 1.354497 -0.787546 ... Toronto Blue Jays Rogers Centre 2536562 102 102 TOR TOR TOR 0.252483 0.497742
SEA 32 299 0.602823 0.830240 0.058128 0.602823 2 1 1.397177 -0.607527 ... Seattle Mariners Safeco Field 1761546 92 92 SEA SEA SEA 0.237136 0.471213
PIT 34 375 0.668449 0.831317 0.051754 0.668449 2 1 1.331551 -0.488095 ... Pittsburgh Pirates PNC Park 2256862 94 94 PIT PIT PIT 0.245170 0.481225
CHA 29 250 0.615764 0.831665 0.060447 0.615764 2 2 1.384236 -0.501502 ... Chicago Cubs Wrigley Field 2642682 104 105 CHC CHN CHN 0.248966 0.450836
CLE 26 203 0.624615 0.842396 0.072385 0.624615 2 1 1.375385 -0.353333 ... Cleveland Indians Progressive Field 1572926 93 94 CLE CLE CLE 0.254529 0.498445
PHI 35 383 0.643697 0.817558 0.053483 0.643697 2 2 1.356303 -0.445283 ... Philadelphia Phillies Citizens Bank Park 3012403 101 102 PHI PHI PHI 0.248350 0.462060
CHN 34 372 0.663102 0.815573 0.049934 0.663102 2 1 1.336898 -0.433526 ... Chicago White Sox U.S. Cellular Field 1768413 107 107 CHW CHA CHA 0.237723 0.480720
SLN 28 248 0.656085 0.843795 0.079436 0.656085 2 1 1.343915 -1.000000 ... St. Louis Cardinals Busch Stadium III 3369769 99 97 STL SLN SLN 0.268850 0.485514
HOU 30 301 0.691954 0.858215 0.051790 0.691954 2 1 1.308046 -0.618280 ... Houston Astros Minute Maid Park 1651883 99 101 HOU HOU HOU 0.239509 0.454279
WAS 31 307 0.660215 0.828591 0.057780 0.660215 2 1 1.339785 -0.691460 ... Washington Nationals Nationals Park 2652422 102 101 WSN MON WAS 0.251104 0.479765
LAN 34 390 0.695187 0.837954 0.058322 0.695187 3 2 1.306595 -0.614907 ... Los Angeles Dodgers Dodger Stadium 3743527 95 95 LAD LAN LAN 0.263522 0.475687
KCA 28 244 0.645503 0.837996 0.050331 0.645503 2 1 1.354497 -0.280840 ... Kansas City Royals Kauffman Stadium 1750754 103 103 KCR KCA KCA 0.260047 0.450712
ANA 25 223 0.743333 0.866293 0.069387 0.743333 2 1 1.256667 -0.452769 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
SFN 32 338 0.681452 0.835695 0.058266 0.681452 2 1 1.318548 -0.802667 ... San Francisco Giants AT&T Park 3326796 90 89 SFG SFN SFN 0.260447 0.457313
TBA 26 209 0.643077 0.847691 0.058350 0.643077 2 1 1.356923 -0.583333 ... Tampa Bay Rays Tropicana Field 1510300 96 95 TBR TBA TBA 0.256591 0.495125
ARI 30 317 0.728736 0.844379 0.122853 0.728736 2 1 1.271264 -0.837681 ... Arizona Diamondbacks Chase Field 2134795 102 102 ARI ARI ARI 0.258633 0.473044
SDN 34 376 0.670232 0.822551 0.089665 0.670232 2 2 1.329768 -0.534694 ... San Diego Padres Petco Park 2166691 91 91 SDP SDN SDN 0.244517 0.453689

30 rows × 61 columns


In [108]:
team_joined_df.columns


Out[108]:
Index([u'Nodes', u'Edges', u'Density', u'Avg_Clustering', u'Avg_Weighted_Clustering', u'Avg_Connectivity', u'Diameter', u'Radius', u'Avg_Shortest_path', u'yearID', u'lgID', u'franchID', u'divID', u'Rank', u'G', u'Ghome', u'W', u'L', u'DivWin', u'WCWin', u'LgWin', u'WSWin', u'R', u'AB', u'H', u'2B', u'3B', u'HR', u'BB', u'SO', u'SB', u'CS', u'HBP', u'SF', u'RA', u'ER', u'ERA', u'CG', u'SHO', u'SV', u'IPouts', u'HA', u'HRA', u'BBA', u'SOA', u'E', u'DP', u'FP', u'name', u'park', u'attendance', u'BPF', u'PPF', u'teamIDBR', u'teamIDlahman45', u'teamIDretro', u'BA', u'SLG'], dtype='object')

In [175]:
sns.lmplot('Avg_Clustering','SLG',team_joined_df,hue='lgID')
plt.xlabel('Average team clustering')
plt.ylabel('Slugging percentage')
plt.tight_layout()
plt.savefig('cluster-SLG.png')



In [176]:
sns.lmplot(u'Strength_entropy','W',team_joined_df,hue='lgID')
plt.xlabel('Within team strength entropy')
plt.ylabel('Wins')
plt.tight_layout()
plt.savefig('strength_entropy-wins.png')


Annual analyses


In [12]:
total_edgelist = Counter()
yearly_graphlist = list()

for year in np.arange(1914,2014):
    gl = pd.read_csv(fdir + 'GameLogs/GL{0}.TXT'.format(str(year)),header=None)
    home = Counter([j for i in gl[np.arange(105,132,3)].values.tolist() for j in list(itertools.combinations(i,2))])
    away = Counter([j for i in gl[np.arange(132,159,3)].values.tolist() for j in list(itertools.combinations(i,2))])
    games = home + away
    total_edgelist += games
    
    g = nx.Graph()
    for (p1,p2),c in games.iteritems():
        g.add_edge(p1,p2,weight=c)
    yearly_graphlist.append(g)

In [97]:
nx.write_gexf(yearly_graphlist[-2],'2012.gexf')

In [80]:
annual_network_stats = pd.DataFrame(index=np.arange(1914,2014))
annual_network_stats['Nodes'] = [g.number_of_nodes() for g in yearly_graphlist]
annual_network_stats['Edges'] = [g.number_of_edges() for g in yearly_graphlist]
annual_network_stats['Density'] = [nx.density(g) for g in yearly_graphlist]
annual_network_stats['Avg_Clustering'] = [nx.average_clustering(g) for g in yearly_graphlist]
annual_network_stats['Avg_Connectivity'] = [np.mean(nx.degree_centrality(g).values())*(len(g)-1) for g in yearly_graphlist]
#annual_network_stats['Rich_Club'] = [nx.rich_club_coefficient(g) for g in yearly_graphlist]
annual_network_stats['Components'] = [nx.number_connected_components(g) for g in yearly_graphlist]
annual_network_stats['Exponent_Centrality'] = [powerlaw.Fit([int(i*(len(g)-1)) for i in nx.degree_centrality(g).values()],xmin=8,discrete=True).power_law.alpha for g in yearly_graphlist]
annual_network_stats['Exponent_Weight'] = [powerlaw.Fit([d['weight'] for n1,n2,d in g.edges_iter(data=True)],discrete=True).power_law.alpha for g in yearly_graphlist]

diameters = list()
radiuses = list()
avg_shortest_path = list()
frac_lcc = list()

for g in yearly_graphlist:
    lcc = nx.connected_component_subgraphs(g)[0]
    diameters.append(nx.diameter(lcc))
    radiuses.append(nx.radius(lcc))
    avg_shortest_path.append(nx.average_shortest_path_length(lcc))
    frac_lcc.append(len(lcc)/float(len(g)))
    
annual_network_stats['Diameter'] = diameters
annual_network_stats['Radius'] = radiuses
annual_network_stats['Avg_Shortest_Path'] = avg_shortest_path
annual_network_stats['Frac_LCC'] = frac_lcc

annual_network_stats.to_csv('1914-2014.csv')
annual_network_stats.head()


Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Out[80]:
Nodes Edges Density Avg_Clustering Avg_Connectivity Components Exponent_Centrality Exponent_Weight Diameter Radius Avg_Shortest_Path Frac_LCC
1914 652 6516 0.030703 0.787874 19.987730 4 2.105651 2.424115 9 5 4.334811 0.831288
1915 662 6571 0.030033 0.780841 19.851964 3 2.135761 2.366330 12 6 5.217547 0.922961
1916 458 4689 0.044805 0.787593 20.475983 3 2.097455 2.458791 12 6 5.158481 0.893013
1917 438 4298 0.044910 0.801067 19.625571 4 2.126881 2.345605 12 6 4.918967 0.840183
1918 427 3869 0.042539 0.806555 18.121780 6 2.237536 2.233264 12 6 5.257413 0.700234

Charts


In [88]:
ax = annual_network_stats['Nodes'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Players')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'nodes.pdf')



In [89]:
ax = annual_network_stats['Density'].plot(lw=4)
ax.set_ylabel('Density')
ax.set_xlabel('Year')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'density.pdf')



In [90]:
ax = annual_network_stats['Avg_Clustering'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Average Clustering')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'clustering.pdf')



In [91]:
ax = annual_network_stats['Avg_Shortest_Path'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Average shortest path')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'shortest_path.pdf')



In [92]:
ax = annual_network_stats['Frac_LCC'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('LCC node fraction')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'frac_lcc.pdf')



In [93]:
ax = annual_network_stats['Avg_Connectivity'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Average degree')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'degree.pdf')



In [94]:
ax = annual_network_stats['Exponent_Centrality'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Power law exponent, centrality')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.ylim((1.6,2.4))
plt.savefig(figpath+'exponent_centrality.pdf')



In [95]:
ax = annual_network_stats['Exponent_Weight'].plot(lw=4)
ax.set_xlabel('Year')
ax.set_ylabel('Power law exponent, weight')
plt.xlim((1914,2014))
# Strike years
plt.axvline(1972,c='r',linestyle='--')
plt.axvline(1981,c='r',linestyle='--')
plt.axvline(1985,c='r',linestyle='--')
plt.axvline(1994,c='r',linestyle='--')
# Expansion years
plt.axvline(1969,c='g',linestyle='--')
plt.axvline(1977,c='g',linestyle='--')
plt.axvline(1993,c='g',linestyle='--')
plt.axvline(1998,c='g',linestyle='--')
plt.tight_layout()
plt.savefig(figpath+'exponent_weight.pdf')


Whole graph analyses


In [16]:
g_all = nx.Graph()
for (p1,p2),c in total_edgelist.iteritems():
    g_all.add_edge(p1,p2,weight=c)

In [49]:
centralities = [int(i*(len(g_all)-1)) for i in nx.degree_centrality(g_all).values()]
degree_counter = Counter([i for i in centralities if i >= 8])

plt.scatter(degree_counter.keys(),degree_counter.values(),c='r')

#plt.ylabel('Count')
plt.yscale('log')
plt.ylim((.9,1e3))
#plt.xlabel('Observation')
plt.xscale('log')

plt.legend(loc='upper right')
plt.tight_layout()
plt.savefig('degree_distribution.png')
plt.savefig(figpath+'degree_distribution.pdf')



In [51]:
weights = [d['weight'] for n1,n2,d in g_all.edges_iter(data=True)]
weight_counter = Counter(weights)

plt.scatter(weight_counter.keys(),weight_counter.values(),c='b')

#plt.ylabel('Count')
plt.yscale('log')
#plt.ylim((.9,1e3))
#plt.xlabel('Observation')
plt.xscale('log')

plt.legend(loc='upper right')
plt.tight_layout()
plt.savefig('weight_distribution.png')
plt.savefig(figpath+'weight_distribution.pdf')



In [56]:
strengths = [sum(g_all[node][neighbor]['weight'] for neighbor in g_all[node]) for node in g_all.nodes()]
strength_counter = Counter(strengths)

plt.scatter(strength_counter.keys(),strength_counter.values(),c='g')

#plt.ylabel('Count')
plt.yscale('log')
#plt.ylim((.9,1e3))
#plt.xlabel('Observation')
plt.xscale('log')

plt.legend(loc='upper right')
plt.tight_layout()
plt.savefig('strength_distribution.png')
plt.savefig(figpath+'strength_distribution.pdf')



In [35]:
centralities_pl_alpha = powerlaw.Fit(centralities,xmin=8,discrete=True).power_law.alpha

weights_pl_alpha = powerlaw.Fit(degrees,discrete=True).power_law.alpha

print centralities_pl_alpha, weights_pl_alpha


Calculating best minimal value for power law fit
1.58946237539 3.2291248836

In [61]:
def extract_backbone(g, alpha):
  backbone_graph = nx.Graph()
  for node in g:
      k_n = len(g[node])
      if k_n > 1:
          sum_w = sum( g[node][neighbor]['weight'] for neighbor in g[node] )
          for neighbor in g[node]:
              edgeWeight = g[node][neighbor]['weight']
              pij = float(edgeWeight)/sum_w
              if (1-pij)**(k_n-1) < alpha: # equation 2
                  backbone_graph.add_edge( node,neighbor, weight = edgeWeight)
  return backbone_graph

In [65]:
g_backbone = extract_backbone(g,.125)
nx.write_gexf(g_backbone,'1995-2013_backbone.gexf')
g.number_of_edges(), g_backbone.number_of_edges()


Out[65]:
(111309, 21387)

In [ ]: